
#############################################################
### Geographic Representativeness of Legislatures Project ###
###                                                       ###
###           Leonardo Carella, Andrew Eggers             ###
#############################################################

# This script replicates the within-country analysis presented in
# section 5.1 of the paper: the Italian case study.
# First, it generates gridded population estimates for Italy
# in the required years, using HYDE 3.2 data for 1983-2001 elections
# and Gridded Population of the World (GWP v4) data for 2006-2022. 
# Then it loads data on Italian MPs' birthplaces and produces descriptive
# statistics on ‘parachuters’. Finally, it draws on the grids datasets and 
# the birthplace data to compute SURLI and EMD scores for relevant legislatures/tiers.

# The code requires three datasets as inputs:
# 1. the population_histsoc_5min_annual_1861-2005.nc dataset (HYDE 3.2)
# 2. the gpw_v4_population_count_rev11_2pt5_min.nc dataset (GWP v4)
# 3. the ita_mps_10112022.rds dataset, containing information on Italian MPs

# The code produces as outputs:
# 1. 11 RDS files named "grids_italy_XXXX.rds" where XXXX is the population-year of reference;
# these contain the population estimates for 15-arcmin grids.
# 2. 21 RDS files named "surli_data_italy_Y_ZZZZ_.rds", where Y is the legislature number 
# and ZZZZ is the electoral system/tier of reference (Open List, Closed List, Mixed-Member Combined)
# these contain the real and fake legislature (or tier, for MXM systems) discrepancy estimates.

# These datasets are subsequently re-loaded in the script to proceed with the analysis. 

#### Packages ####

`%notin%` <- Negate(`%in%`) #for wrangling

library(sf)
library(tidyverse)
library(sp)
library(ggmap)
library(haven)
library(stringi)
library(readxl)
library(raster)
library(mapdata)
library(Rmisc)
library(gridExtra)
library(rgdal)
library(geosphere)
library(Matrix)
library(parallel)
library(move)
library(patchwork)
library(ncdf4)
library(xtable)

#### Get 2000-2005-2010-2015-2020 population grids for Italy from GWP ####

lookup <- cbind.data.frame(layer = 1:5, year = c(2000, 2005, 2010, 2015, 2020))

for (i in 1:5)  {
  
  popdata <- brick("gpw_v4_population_count_rev11_2pt5_min.nc")
  subset_grids_original <- subset(popdata, lookup$layer[i])
  subset_grids_original <- raster::aggregate(subset_grids_original, fact = 2, fun=sum)
  raster::writeRaster(subset_grids_original, 
                      filename=paste("gpw_",lookup$year[i], ".asc", sep = ""),
                      format = "ascii", datatype='INT4S', overwrite=TRUE)
  subset_grids <- as.data.frame(read.asciigrid(paste("gpw_",lookup$year[i], ".asc", sep = "")))
  subset_grids$Country <- map.where(database = "world", subset_grids$s1, subset_grids$s2)
  subset_grids <- as.data.frame(separate(subset_grids, Country, into = c("CountryNAME", "Other_Location"), sep = ":", fill = "right", extra = "drop"))
  
  grids_italy <- subset(subset_grids, subset_grids$Country == "Italy")
  colnames(grids_italy)[1] <- "pop"
  grids_italy2 <- grids_italy %>%
    dplyr::select(s1, s2, pop)
  raster_italy <- rasterFromXYZ(grids_italy2)
  writeRaster(raster_italy, filename="raster_italy.nc", format="CDF", overwrite=TRUE)   
  raster_italy <- stack("raster_italy.nc") 
  raster_italy2 <- raster::aggregate(raster_italy, fact = 3, fun=sum)
  raster_italy2 <- subset(raster_italy2, 1)
  writeRaster(raster_italy2, filename="raster_italy2.asc", 
              format = "ascii", datatype='INT4S', overwrite=TRUE)
  raster_italy3 <- as.data.frame(read.asciigrid("raster_italy2.asc")) %>%
    dplyr::filter(raster_italy2.asc > 0) %>%
    dplyr::rename(pop = raster_italy2.asc)
  write_rds(raster_italy3, paste("grids_italy_", lookup$year[i], ".rds", sep = ""))
  
  print(paste("Completed Italy for", lookup$year[i]))
  
}

#### Get 1983-1987-1992-1994-1996-2001 population grids from HYDE3.2  ####

grid_size_parameter <- 15
countries_parameter <- c("Italy")
for (year_parameter in c(1983, 1987, 1992, 1994, 1996, 2001)) {
  print(year_parameter)
  popdata <- brick("population_histsoc_5min_annual_1861-2005.nc") 
  subset_grids_original <- subset(popdata, year_parameter-1860)
  #save it as an ascii file, convert to long-format dataset
  writeRaster(subset_grids_original, filename="subset_grids_original.asc", format = "ascii", datatype='INT4S', overwrite=TRUE)
  subset_grids <- as.data.frame(read.asciigrid("subset_grids_original.asc"))
  #associate a country with each (small) grid centroid
  subset_grids$Country <- map.where(database = "world", subset_grids$s1, subset_grids$s2)
  subset_grids <- as.data.frame(separate(subset_grids, Country, into = c("CountryNAME", "Other_Location"), sep = ":", fill = "right", extra = "drop"))
  countryname <- unique(subset_grids$CountryNAME)
  countryfactor <- unique(as.numeric(as.factor(subset_grids$CountryNAME)))
  lookup <- cbind.data.frame(countryname, countryfactor) %>%
    dplyr::filter(!is.na(countryname))
  #create a multi-layer raster with one layer for population 
  # and one layer for country code
  xyz <- rasterToPoints(subset_grids_original)
  xyzz <- cbind.data.frame(xyz,country = as.numeric(as.factor(subset_grids$CountryNAME)), stringsAsFactors = F)
  master_raster <- rasterFromXYZ(xyzz)
  #save it and stack it
  writeRaster(master_raster, filename="master_raster.nc", format="CDF", overwrite=TRUE)   
  master_raster <- stack("master_raster.nc") 
  
  grids_dataframe <- data.frame(pop = NA, s1 = NA, s2 = NA, country = NA)
  system.time(for (i in countries_parameter) {
    out_vec <- get_grids(i, grid_size_parameter/5)
    out_vec$country <- paste(i)
    grids_dataframe <- rbind(grids_dataframe,out_vec) %>%
      dplyr::filter(!is.na(pop))
    print(paste("Completed", i, sep = " "))
  })
  
  #### save resulting dataset as a file in the format grids_year_gridsize.rds, with grid IDs, year and gridsize information ####
  grids_dataframe$gID <- 1:length(grids_dataframe$pop)
  grids_dataframe$year <- year_parameter
  grids_dataframe$gridsize <- grid_size_parameter
  grids_dataframe <- grids_dataframe %>%
    dplyr::select(gID, pop, s1, s2, country, gridsize, year)
  
  ita_data <- grids_dataframe %>% dplyr::filter(country == "Italy")
  write_rds(ita_data, paste("grids_italy_", year_parameter, ".rds", sep = ""))
  
  #### remove files created in the process ####
  file.remove("subset_grids_original.asc")
  file.remove("master_raster.nc")
  file.remove("country_raster.asc")}

#### Merge gridded population datasets for each election-year ####

italy_grids <- rbind.data.frame(readRDS("grids_italy_1983.rds") %>%
                                  dplyr::mutate(legislature = 9),
                                readRDS("grids_italy_1987.rds")%>%
                                  dplyr::mutate(legislature = 10),
                                readRDS("grids_italy_1992.rds")%>%
                                  dplyr::mutate(legislature = 11),
                                readRDS("grids_italy_1994.rds")%>%
                                  dplyr::mutate(legislature = 12),
                                readRDS("grids_italy_1996.rds")%>%
                                  dplyr::mutate(legislature = 13),
                                readRDS("grids_italy_2001.rds")%>%
                                  dplyr::mutate(legislature = 14),
                                readRDS("grids_italy_2005.rds")%>%
                                  dplyr::mutate(legislature = 15, year = 2005, gID = 1:n(), country = "Italy", gridsize = 15)%>%
                                  dplyr::select(gID, pop, s1, s2, country, gridsize, year, legislature),
                                readRDS("grids_italy_2010.rds")%>%
                                  dplyr::mutate(legislature = 16, year = 2010, gID = 1:n(), country = "Italy", gridsize = 15)%>%
                                  dplyr::select(gID, pop, s1, s2, country, gridsize, year, legislature),
                                readRDS("grids_italy_2015.rds")%>%
                                  dplyr::mutate(legislature = 17, year = 2015, gID = 1:n(), country = "Italy", gridsize = 15)%>%
                                  dplyr::select(gID, pop, s1, s2, country, gridsize, year, legislature),
                                readRDS("grids_italy_2015.rds")%>%
                                  dplyr::mutate(legislature = 18, year = 2015, gID = 1:n(), country = "Italy", gridsize = 15)%>%
                                  dplyr::select(gID, pop, s1, s2, country, gridsize, year, legislature),
                                readRDS("grids_italy_2020.rds")%>%
                                  dplyr::mutate(legislature = 19, year = 2020, gID = 1:n(), country = "Italy", gridsize = 15)%>%
                                  dplyr::select(gID, pop, s1, s2, country, gridsize, year, legislature))

#### Descriptive Statistics (‘Share of Parachuters’ only) ####

ita_mps <- readRDS("ita_mps_10112022.rds") %>%
  dplyr::filter(elected_abroad == 0)

ita_mps$born_in_region <- NA
ita_mps$born_in_region <- ifelse(ita_mps$region_district == ita_mps$region_birth,1,0)
ita_mps$born_in_region[!is.na(ita_mps$bornabroad)] <- 0

data <- ita_mps %>%
  dplyr::group_by(legislature, tipoElezione) %>%
  dplyr::summarise(share_bir = mean(born_in_region, na.rm = T))

combined <- ita_mps %>%
  dplyr::filter(legislature %in% c(12:14,18,19)) %>%
  dplyr::group_by(legislature) %>%
  dplyr::summarise(share_bir = mean(born_in_region, na.rm = T)) %>%
  dplyr::mutate(tipoElezione = "misto") %>%
  dplyr::select(legislature, tipoElezione, share_bir)

data <- rbind.data.frame(data, combined)

data$system <- NA
data$system[data$tipoElezione == "proporzionale" & data$legislature %in% c(9:11)] <- "Open List PR"
data$system[data$tipoElezione == "proporzionale" & data$legislature %in% c(12:19)] <- "Closed List PR"
data$system[data$tipoElezione == "maggioritario"] <- "SMD"
data$system[data$tipoElezione == "misto"] <- "MXM Combined"

data$year[data$legislature == 9] <- 1983
data$year[data$legislature == 10] <- 1987
data$year[data$legislature == 11] <- 1992
data$year[data$legislature == 12] <- 1994
data$year[data$legislature == 13] <- 1996
data$year[data$legislature == 14] <- 2001
data$year[data$legislature == 15] <- 2006
data$year[data$legislature == 16] <- 2008
data$year[data$legislature == 17] <- 2013
data$year[data$legislature == 18] <- 2018
data$year[data$legislature == 19] <- 2022

data$system2 <- data$system  
data$system2[data$legislature %in% c(18,19) & data$system == "SMD"] <- "SMD2"
data$system2[data$legislature %in% c(18,19) & data$system == "MXM Combined"] <- "MXM Combined2"

# generates top panel of figure 5 (share born outside region)

my_palette <- c("blue", "tomato", "orange", "violet")
born_outside_graph <- ggplot(data = data, mapping = aes(x = year, y = 1-share_bir)) +
  geom_point(mapping = aes(col = system, group = system, shape = system, fill = system)) + 
  geom_line(mapping = aes(col = system, group = system2)) +
  scale_color_manual(values = my_palette, name = "Electoral Rules") +
  scale_fill_manual(values = my_palette, name = "Electoral Rules") +
  scale_shape_manual(values = c(24,21,22,25), name = "Electoral Rules") + 
  xlab("Election Year") + 
  ylab("% Born outside \n Region of Election") + 
  ggtitle("Share of ‘parachuters’ (MPs born outside \n the region of election; Italy, 1983-2022)") + 
  theme_minimal() + 
  geom_vline(xintercept = 1993, lty = 2, lwd = 0.3) + 
  geom_vline(xintercept = 2005, lty = 2, lwd = 0.3) + 
  geom_vline(xintercept = 2017, lty = 2, lwd = 0.3) + 
  geom_text(mapping = aes(x = 1987, y = 0.125), label = "MTM,\nPV") + 
  geom_text(mapping = aes(x = 1999, y = 0.125), label = "MXM,\n no PV") + 
  geom_text(mapping = aes(x = 2011, y = 0.125), label = "MTM,\n no PV") + 
  geom_text(mapping = aes(x = 2021, y = 0.125), label = "MXM,\nno PV") + 
  scale_x_continuous(breaks = c(1993, 2005,2017),
                     limits = c(1982,2024)) + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  scale_y_continuous(labels = scales::percent, limits = c(0.1,0.4))

ita_mps$system2 <- NA
ita_mps$system2[ita_mps$tipoElezione == "proporzionale" & 
                  ita_mps$legislature %in% c(9:11)] <- "Open List PR"
ita_mps$system2[ita_mps$tipoElezione == "proporzionale" & 
                  ita_mps$legislature %in% c(12:14, 18,19)] <- "Closed List PR in MMM"
ita_mps$system2[ita_mps$tipoElezione == "maggioritario"] <- "SMD in MMM"
ita_mps$system2[ita_mps$tipoElezione == "proporzionale" & 
                  ita_mps$legislature %in% c(15:17)] <- "Closed List PR"

# generates ‘mean share of parachuters’ column of table 5

tab_ita <- rbind.data.frame(ita_mps %>%
                              dplyr::group_by(system2) %>%
                              dplyr::summarise(share_bir = mean(born_in_region, na.rm = T),
                                               share_bor = 1- mean(born_in_region, na.rm = T)),
                            ita_mps %>% dplyr::filter(legislature %in% c(12:14, 18,19)) %>%
                              dplyr::mutate(system2 = "MXM Combined") %>%
                              group_by(system2) %>%
                              dplyr::summarise(share_bir = mean(born_in_region, na.rm = T),
                                               share_bor = 1- mean(born_in_region, na.rm = T)))

#### Compute SURLI for every tier-year combination ####

ita_mps <- ita_mps %>%
  dplyr::filter(is.na(bornabroad))
#only Italian-born considered for the rest of the analysis.

lookup <- ita_mps %>% dplyr::group_by(legislature, system) %>% dplyr::summarise()
lookup <- rbind.data.frame(lookup,
                           cbind.data.frame(legislature = c(12,13,14,18,19), 
                                            system = "MXM Combined"))

# Get real and fake discrepancy estimates (RCDF method) for each tier-year
for (n in 1:nrow(lookup)){
  
  if(lookup$system[n] != "MXM Combined") {
    df1 <- ita_mps %>% 
      dplyr::filter(legislature == lookup$legislature[n]) %>%
      dplyr::filter(system == lookup$system[n])
  }
  
  if(lookup$system[n] == "MXM Combined") {
    df1 <- ita_mps %>% 
      dplyr::filter(legislature == lookup$legislature[n])}
  
  df2 <- italy_grids %>%
    dplyr::filter(legislature == lookup$legislature[n])
  
  bpmatrix <- as.matrix(df1$bpcoords)
  gmatrix <- as.matrix(cbind(as.numeric(df2$s1), as.numeric(df2$s2)), na.omit == T)
  a <- pointDistance(bpmatrix, gmatrix, lonlat=TRUE, allpairs = TRUE) #added the allpairs parameter: relevant when number of grids = number of MPs.
  b <- apply(a, 1, which.min)
  k <- unlist(b)
  p <- data.frame(df2$gID[k], df2$s1[k], df2$s2[k], df1$person_title)
  countdf <- as.data.frame(table(p$df2.gID.k.))
  countdf$gID <- countdf$Var1 
  output <- merge(df2, countdf, by = "gID", all = TRUE)
  output$Freq[is.na(output$Freq)] <- 0
  output$popshare <- output$pop/sum(output$pop)
  output$MPshare <- output$Freq/sum(output$Freq)
  popsp <- SpatialPointsDataFrame(coords = cbind(output$s1, output$s2), data = data.frame(output$popshare))
  mpsp <- SpatialPointsDataFrame(coords = cbind(output$s1, output$s2), data = data.frame(output$MPshare))
  df1 <- as.data.frame(popsp)
  df2 <- as.data.frame(mpsp)
  grid_coords <- as.data.frame(df1[-1])
  sig1 <- df1[,1]
  sig2 <- df2[,1]
  real_rcdf <- rotated_1d_cdf_discrepancy(grid_coords, sig1, sig2, increments = 4, 
                                          recenter_at = "mean_of_means", normalize = F, raw_data = F)
  
  fake_rcdfs <- c()
  for(seed in 1:500){
    set.seed(seed)
    x <- as.data.frame(table(sample(length(output$gID), sum(output$Freq), 
                                    prob = (output$pop), replace = TRUE)))
    names(x)[names(x) == "Var1"] <- "gIDnew"
    names(x)[names(x) == "Freq"] <- "simulFreq"
    output$gIDnew <- 1:length(output$gID)
    simulOUTPUT <- merge(output, x, by = "gIDnew", all = TRUE)
    simulOUTPUT$simulFreq[is.na(simulOUTPUT$simulFreq)] <- 0
    simulOUTPUT$popshare <- simulOUTPUT$pop/sum(simulOUTPUT$pop)
    simulOUTPUT$MPshare <- simulOUTPUT$simulFreq/sum(simulOUTPUT$simulFreq)
    #computes EMDs
    popsp <- SpatialPointsDataFrame(coords = cbind(simulOUTPUT$s1, simulOUTPUT$s2), data = data.frame(simulOUTPUT$popshare))
    mpsp <- SpatialPointsDataFrame(coords = cbind(simulOUTPUT$s1, simulOUTPUT$s2), data = data.frame(simulOUTPUT$MPshare))
    df1 <- as.data.frame(popsp)
    df2 <- as.data.frame(mpsp)
    grid_coords <- as.data.frame(df1[-1])
    sig1 <- df1[,1]
    sig2 <- df2[,1]
    rcdf_simul <- rotated_1d_cdf_discrepancy(grid_coords, sig1, sig2, increments = 4, recenter_at = "mean_of_means", normalize = F, raw_data = F)
    fake_rcdfs <- rbind(fake_rcdfs, rcdf_simul)
  }
  
  fake_rcdfs <- data.frame(fake_rcdfs)
  fake_rcdfs$real_rcdf <- real_rcdf
  write_rds(fake_rcdfs, gsub(" ", "_", paste("surli_data_italy", lookup$legislature[n], 
                                             lookup$system[n], ".rds", sep = "_" )))
  
  print(paste("Completed", lookup$legislature[n], "legislature", lookup$system[n], "tier" ))
}

# Merge datasets and compute SURLI as ratio of real EMD and mean(fake EMDs).

ex <- rbind.data.frame(
  read_rds("surli_data_italy_9_Open_List_PR_.rds") %>%
    dplyr::mutate(system = "Open List PR", year = 1983) %>%
    dplyr::mutate(surli = (mean(real_rcdf)/mean(fake_rcdfs))),
  read_rds("surli_data_italy_10_Open_List_PR_.rds")%>%
    dplyr::mutate(system = "Open List PR", year = 1987) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_11_Open_List_PR_.rds")%>%
    dplyr::mutate(system = "Open List PR", year = 1992) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_12_Closed_List_PR_.rds")%>%
    dplyr::mutate(system = "Closed List PR", year = 1994) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_12_SMD_.rds")%>%
    dplyr::mutate(system = "SMD", year = 1994) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_13_Closed_List_PR_.rds")%>%
    dplyr::mutate(system = "Closed List PR", year = 1996) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_13_SMD_.rds")%>%
    dplyr::mutate(system = "SMD", year = 1996) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_14_Closed_List_PR_.rds")%>%
    dplyr::mutate(system = "Closed List PR", year = 2001) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_14_SMD_.rds")%>%
    dplyr::mutate(system = "SMD", year = 2001) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_15_Closed_List_PR_.rds")%>%
    dplyr::mutate(system = "Closed List PR", year = 2006) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_16_Closed_List_PR_.rds")%>%
    dplyr::mutate(system = "Closed List PR", year = 2008) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_17_Closed_List_PR_.rds")%>%
    dplyr::mutate(system = "Closed List PR", year = 2013) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_18_Closed_List_PR_.rds")%>%
    dplyr::mutate(system = "Closed List PR", year = 2018) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_18_SMD_.rds")%>%
    dplyr::mutate(system = "SMD", year = 2018) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_12_MXM_Combined_.rds")%>%
    dplyr::mutate(system = "MXM Combined", year = 1994) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_13_MXM_Combined_.rds")%>%
    dplyr::mutate(system = "MXM Combined", year = 1996) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_14_MXM_Combined_.rds")%>%
    dplyr::mutate(system = "MXM Combined", year = 2001) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_18_MXM_Combined_.rds")%>%
    dplyr::mutate(system = "MXM Combined", year = 2018) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_19_MXM_Combined_.rds")%>%
    dplyr::mutate(system = "MXM Combined", year = 2022) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_19_SMD_.rds")%>%
    dplyr::mutate(system = "SMD", year = 2022) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs)),
  read_rds("surli_data_italy_19_Closed_List_PR_.rds")%>%
    dplyr::mutate(system = "Closed List PR", year = 2022) %>%
    dplyr::mutate(surli = mean(real_rcdf)/mean(fake_rcdfs))) %>%
  dplyr::group_by(year, system) %>%
  dplyr::summarise(surli = mean(surli), real_rcdf = mean(real_rcdf))

#### Visualisation (Figure 5) ####

# middle panel of figure 5: ‘raw’ EMD score (= real RCDF estimate of discrepancy)
ex$system2 <- ex$system
ex$system2[ex$system == "SMD" & ex$year %in% c(2018,2022)] <- "SMD2"
ex$system2[ex$system == "MXM Combined" & ex$year %in% c(2018,2022)] <- "MXM Combined 2"

emd_graph <- ggplot(data = ex, mapping = aes(x = year, y = real_rcdf)) +
  geom_point(mapping = aes(col = system, group = system, shape = system, fill = system)) + 
  geom_line(mapping = aes(col = system, group = system2)) +
  scale_color_manual(values = my_palette, name = "Electoral Rules") +
  scale_fill_manual(values = my_palette, name = "Electoral Rules") +
  scale_shape_manual(values = c(24,21,22,25), name = "Electoral Rules") + 
  xlab("Election Year") + 
  ylab("EMD score") + 
  ggtitle("EMD scores (Italy, 1983-2022)") + 
  theme_minimal() + 
  geom_vline(xintercept = 1993, lty = 2, lwd = 0.3) + 
  geom_vline(xintercept = 2005, lty = 2, lwd = 0.3) + 
  geom_vline(xintercept = 2017, lty = 2, lwd = 0.3) + 
  geom_text(mapping = aes(x = 1987, y = 0.05), label = "MTM,\nPV") + 
  geom_text(mapping = aes(x = 1999, y = 0.05), label = "MXM,\n no PV") + 
  geom_text(mapping = aes(x = 2011, y = 0.05), label = "MTM,\n no PV") + 
  geom_text(mapping = aes(x = 2021, y = 0.05), label = "MXM,\nno PV") + 
  scale_x_continuous(breaks = c(1993, 2005,2017),
                     limits = c(1982,2024)) + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  scale_y_continuous(limits = c(0,0.5))

# top panel of figure 5: SURLI
surli_graph <- ggplot(data = ex, mapping = aes(x = year, y = surli)) +
  geom_point(mapping = aes(col = system, group = system, shape = system, fill = system)) + 
  geom_line(mapping = aes(col = system, group = system2)) +
  scale_color_manual(values = my_palette, name = "Electoral Rules") +
  scale_fill_manual(values = my_palette, name = "Electoral Rules") +
  scale_shape_manual(values = c(24,21,22,25), name = "Electoral Rules") + 
  xlab("Election Year") + 
  ylab("SURLI") + 
  ggtitle("SURLI scores (Italy, 1983-2022)") + 
  theme_minimal() + 
  geom_vline(xintercept = 1993, lty = 2, lwd = 0.3) + 
  geom_vline(xintercept = 2005, lty = 2, lwd = 0.3) + 
  geom_vline(xintercept = 2017, lty = 2, lwd = 0.3) + 
  geom_text(mapping = aes(x = 1987, y = 0.25), label = "MTM,\nPV") + 
  geom_text(mapping = aes(x = 1999, y = 0.25), label = "MXM,\n no PV") + 
  geom_text(mapping = aes(x = 2011, y = 0.25), label = "MTM,\n no PV") + 
  geom_text(mapping = aes(x = 2021, y = 0.25), label = "MXM,\nno PV") + 
  scale_x_continuous(breaks = c(1993, 2005,2017),
                     limits = c(1982,2024)) + 
  theme(plot.title = element_text(hjust = 0.5)) + 
  scale_y_continuous(limits = c(0,3.5))

# produce and save figure 5
(born_outside_graph) /
  (emd_graph) /
  (surli_graph)

ggsave("italy_case_study_plots.png", width = 7.5, height = 10)

#### Descriptive Statistics (EMD, SURLI) and Table 5 ####

# compute mean EMD, SURLI under different systems
ex$system3 <- ex$system
ex$system3[ex$system == "Closed List PR" & ex$year %notin% c(1994:2001, 2018, 2022)] <- "Closed List PR"
ex$system3[ex$system == "Closed List PR" & ex$year %in% c(1994:2001, 2018, 2022)] <- "Closed List PR in MMM"
ex$system3[ex$system == "SMD"] <- "SMD in MMM"
ex$system3[ex$system == "Open List"] <- "Open List PR"

tab_ita2 <- ex %>% dplyr::group_by(system3) %>%
  dplyr::summarise(mean_emd = mean(real_rcdf),
                   mean_surli = mean(surli))

# produce table 5
tab_ita_final <- left_join(tab_ita %>% dplyr::select(system2, share_bor) %>% 
                             dplyr::rename(system3 = system2), tab_ita2)
print.xtable(xtable(tab_ita_final, digits = 3), include.rownames = FALSE)
